/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.db;
import java.io.*;
import java.util.*;
import java.nio.channels.*;
import net.nutch.io.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;
/**********************************************
* The WebDBReader implements all the read-only
* operations for accessing our web database.
* The write operations can be found in WebDBWriter.
*
* @author Mike Cafarella
**********************************************/
public class WebDBReader implements IWebDBReader {
static final Page[] PAGE_RECORDS = new Page[0];
static final Link[] LINK_RECORDS = new Link[0];
// filenames
static final String PAGES_BY_URL = "pagesByURL";
static final String PAGES_BY_MD5 = "pagesByMD5";
static final String LINKS_BY_URL = "linksByURL";
static final String LINKS_BY_MD5 = "linksByMD5";
static final String STATS_FILE = "stats";
File dbFile;
MapFile.Reader pagesByURL, pagesByMD5, linksByURL, linksByMD5;
long totalPages = 0, totalLinks = 0;
Vector mapReaders = null, setReaders = null;
FileInputStream dbReadLockData;
FileLock dbReadLock;
/**
 * Open a web db reader for the named directory.
 *
 * Takes a shared lock on the "dbreadlock" file in <code>dir</code>,
 * opens the four tables under <code>dir</code>/webdb, and loads the
 * page and link totals from the stats file when that file exists.
 * If anything fails after the lock file has been opened, every
 * resource acquired so far is released again before the exception
 * reaches the caller, so a failed open does not leave the read lock
 * held.
 *
 * @param dir directory that holds the "webdb" directory and the lock file
 * @throws IOException if the lock cannot be obtained, or a table or
 *         the stats file cannot be opened or read
 */
public WebDBReader(File dir) throws IOException, FileNotFoundException {
    this.dbFile = new File(dir, "webdb");

    // Obtain read lock on db so writers don't try to
    // move it out from under us. This obtains a non-exclusive
    // lock on the directory that holds the dbs (old and new)
    File readLockFile = new File(dir, "dbreadlock");
    readLockFile.createNewFile();
    this.dbReadLockData = new FileInputStream(readLockFile);

    boolean opened = false;
    try {
        this.dbReadLock = dbReadLockData.getChannel().lock(0L, Long.MAX_VALUE, true);

        // Create tables
        this.pagesByURL = new MapFile.Reader(new File(dbFile, PAGES_BY_URL).getPath(), new UTF8.Comparator());
        this.pagesByMD5 = new MapFile.Reader(new File(dbFile, PAGES_BY_MD5).getPath(), new Page.Comparator());
        this.linksByURL = new MapFile.Reader(new File(dbFile, LINKS_BY_URL).getPath(), new Link.UrlComparator());
        this.linksByMD5 = new MapFile.Reader(new File(dbFile, LINKS_BY_MD5).getPath(), new Link.MD5Comparator());

        // Load in statistics
        File stats = new File(dbFile, STATS_FILE);
        if (stats.exists()) {
            DataInputStream in = new DataInputStream(new FileInputStream(stats));
            try {
                // The leading byte is a version marker; it is consumed
                // but not otherwise used here.
                in.read();
                this.totalPages = in.readLong();
                this.totalLinks = in.readLong();
            } finally {
                in.close();
            }
        }

        // Create vectors so we can GC readers used by
        // enum() calls. We do this so we can have multiple
        // simultaneous enum users. However, since we keep
        // a handle to each one, we're assuming that we don't
        // create too many before WebDBReader.close() is called.
        this.mapReaders = new Vector();
        this.setReaders = new Vector();

        opened = true;
    } finally {
        if (! opened) {
            releaseAfterFailedOpen();
        }
    }
}

/**
 * Best-effort cleanup for a constructor that is about to fail:
 * closes whichever table readers were already opened, then releases
 * the read lock and closes its stream. Failures here are ignored on
 * purpose so that the exception that caused the open to fail is the
 * one the caller sees.
 */
private void releaseAfterFailedOpen() {
    MapFile.Reader[] tables = { pagesByURL, pagesByMD5, linksByURL, linksByMD5 };
    for (int i = 0; i < tables.length; i++) {
        if (tables[i] != null) {
            try {
                tables[i].close();
            } catch (IOException ignored) {
                // already failing; keep the original exception
            }
        }
    }
    if (dbReadLock != null) {
        try {
            dbReadLock.release();
        } catch (IOException ignored) {
            // already failing; keep the original exception
        }
    }
    try {
        dbReadLockData.close();
    } catch (IOException ignored) {
        // already failing; keep the original exception
    }
}
/**
 * Shutdown.
 *
 * Closes the four table readers and every reader that was handed to
 * an enumeration, then releases the read lock and closes its stream.
 * Every step is attempted even when an earlier one fails, so a
 * failing reader cannot leave the read lock held; the first failure
 * encountered is rethrown once all steps have run.
 *
 * @throws IOException the first failure reported by any close or by
 *         the lock release
 */
public void close() throws IOException {
    IOException firstFailure = null;

    MapFile.Reader[] tables = { pagesByURL, pagesByMD5, linksByURL, linksByMD5 };
    for (int i = 0; i < tables.length; i++) {
        try {
            tables[i].close();
        } catch (IOException ie) {
            if (firstFailure == null) {
                firstFailure = ie;
            }
        }
    }

    for (Enumeration e = mapReaders.elements(); e.hasMoreElements(); ) {
        MapFile.Reader tmp = (MapFile.Reader) e.nextElement();
        try {
            tmp.close();
        } catch (IOException ie) {
            if (firstFailure == null) {
                firstFailure = ie;
            }
        }
    }

    for (Enumeration e = setReaders.elements(); e.hasMoreElements(); ) {
        SetFile.Reader tmp = (SetFile.Reader) e.nextElement();
        try {
            tmp.close();
        } catch (IOException ie) {
            if (firstFailure == null) {
                firstFailure = ie;
            }
        }
    }

    // release the lock
    try {
        dbReadLock.release();
    } catch (IOException ie) {
        if (firstFailure == null) {
            firstFailure = ie;
        }
    }
    try {
        dbReadLockData.close();
    } catch (IOException ie) {
        if (firstFailure == null) {
            firstFailure = ie;
        }
    }

    if (firstFailure != null) {
        throw firstFailure;
    }
}
/**
 * Look up a single Page in the pagedb by its URL.
 *
 * @param url the URL to look for
 * @return whatever the pagesByURL lookup yields for that key
 *         (presumably null when the URL is not stored -- confirm
 *         against MapFile.Reader.get)
 * @throws IOException if the lookup fails
 */
public Page getPage(String url) throws IOException {
    UTF8 key = new UTF8(url);
    Page holder = new Page();
    return (Page) pagesByURL.get(key, holder);
}
/**
 * Collect the Pages in the pagedb that carry the given content hash.
 *
 * The pagesByMD5 reader is positioned with a probe Page that has only
 * its MD5 set; entries are then read one after another for as long as
 * their MD5 compares equal to the requested one.
 *
 * @param md5 the content hash to look for
 * @return the matching Pages, or an empty array when there are none
 * @throws IOException if reading the table fails
 */
public Page[] getPages(MD5Hash md5) throws IOException {
    Vector matches = new Vector(3);

    Page current = new Page();
    current.getMD5().set(md5);
    pagesByMD5.seek(current);

    // Each accepted Page is kept, so a fresh instance is needed to
    // receive the following entry.
    while (pagesByMD5.next(current, NullWritable.get())
           && current.getMD5().compareTo(md5) == 0) {
        matches.add(current);
        current = new Page();
    }

    return (Page[]) matches.toArray(PAGE_RECORDS);
}
/**
 * Report whether any Page with the given content hash is stored,
 * without collecting the Page(s).
 *
 * @param md5 the content hash to look for
 * @return true when the first entry read after seeking has an MD5
 *         that compares equal to <code>md5</code>
 * @throws IOException if reading the table fails
 */
public boolean pageExists(MD5Hash md5) throws IOException {
    Page probe = new Page();
    probe.getMD5().set(md5);
    pagesByMD5.seek(probe);

    if (! pagesByMD5.next(probe, NullWritable.get())) {
        // nothing left to read after the seek
        return false;
    }
    return probe.getMD5().compareTo(md5) == 0;
}
/**
 * Iterate through all the Pages, sorted by URL.
 *
 * Each call opens its own reader on the pagesByURL file, so several
 * enumerations can be in progress at once. The reader is remembered
 * in mapReaders and is closed by close().
 *
 * @return an Enumeration whose elements are Page objects
 * @throws IOException if the file cannot be opened
 */
public Enumeration pages() throws IOException {
    // Use the shared file-name constant rather than repeating the literal.
    MapFile.Reader tmpReader = new MapFile.Reader(new File(dbFile, PAGES_BY_URL).getPath());
    mapReaders.add(tmpReader);
    return new TableEnumerator(tmpReader);
}
//
// The TableEnumerator hands out, one at a time, the Page that each
// successive call to MapFile.Reader.next() fills in. It always
// holds the upcoming Page in nextItem, reading one entry ahead.
//
// NOTE(review): an IOException while reading is treated exactly
// like end-of-data, so a failed read ends the enumeration silently.
//
class TableEnumerator implements Enumeration {
    MapFile.Reader reader;
    Page nextItem;

    /**
     * Remember the reader and read the first entry ahead of time.
     */
    public TableEnumerator(MapFile.Reader reader) {
        this.reader = reader;
        advance();
    }

    /**
     * True for as long as a read-ahead Page is waiting.
     */
    public boolean hasMoreElements() {
        return nextItem != null;
    }

    /**
     * Hand back the waiting Page and read the one after it.
     *
     * @throws NoSuchElementException when no Page is waiting
     */
    public Object nextElement() {
        Page current = nextItem;
        if (current == null) {
            throw new NoSuchElementException("PageDB Enumeration");
        }
        advance();
        return current;
    }

    /**
     * Read the next entry into a fresh Page; nextItem ends up null
     * when the reader has nothing more or the read fails.
     */
    private void advance() {
        nextItem = new Page();
        try {
            boolean found = reader.next(new UTF8(), nextItem);
            if (! found) {
                nextItem = null;
            }
        } catch (IOException ie) {
            nextItem = null;
        }
    }
}
/**
 * Iterate through all the Pages, sorted by MD5.
 *
 * Each call opens its own reader on the pagesByMD5 file, so several
 * enumerations can be in progress at once. The reader is remembered
 * in setReaders and is closed by close().
 *
 * @return an Enumeration whose elements are Page objects
 * @throws IOException if the file cannot be opened
 */
public Enumeration pagesByMD5() throws IOException {
    // Use the shared file-name constant rather than repeating the literal.
    SetFile.Reader tmpReader = new SetFile.Reader(new File(dbFile, PAGES_BY_MD5).getPath());
    setReaders.add(tmpReader);
    return new IndexEnumerator(tmpReader);
}
/**
 * Report the page total that was loaded from the stats file when
 * this reader was opened (zero if there was no stats file).
 *
 * @return the stored number of pages
 */
public long numPages() {
    return this.totalPages;
}
//
// The IndexEnumerator hands out, one at a time, the Page that each
// successive call to SetFile.Reader.next() fills in. It always
// holds the upcoming Page in nextItem, reading one entry ahead.
//
// NOTE(review): an IOException while reading is treated exactly
// like end-of-data, so a failed read ends the enumeration silently.
//
class IndexEnumerator implements Enumeration {
    SetFile.Reader reader;
    Page nextItem;

    /**
     * Remember the reader and read the first entry ahead of time.
     */
    public IndexEnumerator(SetFile.Reader reader) {
        this.reader = reader;
        advance();
    }

    /**
     * True for as long as a read-ahead Page is waiting.
     */
    public boolean hasMoreElements() {
        return nextItem != null;
    }

    /**
     * Hand back the waiting Page and read the one after it.
     *
     * @throws NoSuchElementException when no Page is waiting
     */
    public Object nextElement() {
        Page current = nextItem;
        if (current == null) {
            throw new NoSuchElementException("PageDB Enumeration");
        }
        advance();
        return current;
    }

    /**
     * Read the next entry into a fresh Page; nextItem ends up null
     * when the reader has nothing more or the read fails.
     */
    private void advance() {
        nextItem = new Page();
        try {
            boolean found = reader.next(nextItem);
            if (! found) {
                nextItem = null;
            }
        } catch (IOException ie) {
            nextItem = null;
        }
    }
}
/**
 * Get all the hyperlinks that link TO the indicated URL.
 *
 * The linksByURL reader is positioned with a probe Link that has only
 * its URL set; entries are then read one after another for as long as
 * their URL equals the requested one.
 *
 * @param url the target URL
 * @return the matching Links, or an empty array when there are none
 * @throws IOException if reading the table fails
 */
public Link[] getLinks(UTF8 url) throws IOException {
    Vector matches = new Vector(3);

    Link probe = new Link();
    probe.getURL().set(url);
    linksByURL.seek(probe);

    // Each accepted Link is kept, so the loop switches to a fresh
    // instance before reading the following entry.
    for (Link current = probe;
         linksByURL.next(current, NullWritable.get()) && url.equals(current.getURL());
         current = new Link()) {
        matches.add(current);
    }

    return (Link[]) matches.toArray(LINK_RECORDS);
}
/**
 * Grab all the links whose from-ID is the given MD5 hash.
 *
 * The linksByMD5 reader is positioned with a probe Link that has only
 * its from-ID set; entries are then read one after another for as
 * long as their from-ID equals the requested hash.
 *
 * @param md5 the hash to match against each Link's from-ID
 * @return the matching Links, or an empty array when there are none
 * @throws IOException if reading the table fails
 */
public Link[] getLinks(MD5Hash md5) throws IOException {
    Vector matches = new Vector(3);

    Link probe = new Link();
    probe.getFromID().set(md5);
    linksByMD5.seek(probe);

    // Each accepted Link is kept, so the loop switches to a fresh
    // instance before reading the following entry.
    for (Link current = probe;
         linksByMD5.next(current, NullWritable.get()) && md5.equals(current.getFromID());
         current = new Link()) {
        matches.add(current);
    }

    return (Link[]) matches.toArray(LINK_RECORDS);
}
/**
 * Return all the links, by target URL.
 *
 * Each call opens its own reader on the linksByURL file instead of
 * enumerating over the shared linksByURL reader. The shared reader's
 * position is moved by getLinks(UTF8) and by any other enumeration,
 * so walking it directly could start in the middle of the file and
 * would disturb, or be disturbed by, those other users. The new
 * reader is remembered in mapReaders and is closed by close().
 *
 * @return an Enumeration whose elements are Link objects
 * @throws RuntimeException wrapping the IOException if the file
 *         cannot be opened (this method declares no checked
 *         exception, so the failure cannot be thrown directly)
 */
public Enumeration links() {
    MapFile.Reader tmpReader;
    try {
        tmpReader = new MapFile.Reader(new File(dbFile, LINKS_BY_URL).getPath(), new Link.UrlComparator());
    } catch (IOException ie) {
        throw new RuntimeException("Could not open " + LINKS_BY_URL + " for enumeration", ie);
    }
    mapReaders.add(tmpReader);
    return new MapEnumerator(tmpReader);
}
/**
 * Report the link total that was loaded from the stats file when
 * this reader was opened (zero if there was no stats file).
 *
 * @return the stored number of links
 */
public long numLinks() {
    return this.totalLinks;
}
//
// The MapEnumerator hands out, one at a time, the Link that each
// successive call to MapFile.Reader.next() fills in. It always
// holds the upcoming Link in nextItem, reading one entry ahead.
//
// NOTE(review): an IOException while reading is treated exactly
// like end-of-data, so a failed read ends the enumeration silently.
//
class MapEnumerator implements Enumeration {
    MapFile.Reader reader;
    Link nextItem;

    /**
     * Remember the reader and read the first entry ahead of time.
     */
    public MapEnumerator(MapFile.Reader reader) {
        this.reader = reader;
        advance();
    }

    /**
     * True for as long as a read-ahead Link is waiting.
     */
    public boolean hasMoreElements() {
        return nextItem != null;
    }

    /**
     * Hand back the waiting Link and read the one after it.
     *
     * @throws NoSuchElementException when no Link is waiting
     */
    public Object nextElement() {
        Link current = nextItem;
        if (current == null) {
            throw new NoSuchElementException("PageDB Enumeration");
        }
        advance();
        return current;
    }

    /**
     * Read the next entry into a fresh Link; nextItem ends up null
     * when the reader has nothing more or the read fails.
     */
    private void advance() {
        nextItem = new Link();
        try {
            boolean found = reader.next(nextItem, NullWritable.get());
            if (! found) {
                nextItem = null;
            }
        } catch (IOException ie) {
            nextItem = null;
        }
    }
}
/**
 * The WebDBReader.main() provides some handy utility methods
 * for looking through the contents of the webdb.
 *
 * argv[0] names the db directory and argv[1] the command. The
 * commands -pageurl, -pagemd5, -toppages, -linkurl and -linkmd5 take
 * a value in argv[2]; when that value is missing the usage message
 * is printed and the db is not opened. For -toppages a value of zero
 * or less prints no pages.
 *
 * @param argv command-line arguments as described above
 * @throws IOException if the db cannot be opened, read or closed
 */
public static void main(String argv[]) throws FileNotFoundException, IOException {
    if (argv.length < 2) {
        printUsage();
        return;
    }
    // Check for the command's value up front, so a missing one gives
    // the usage text rather than an ArrayIndexOutOfBoundsException.
    if (argv.length < 3 && commandNeedsArgument(argv[1])) {
        printUsage();
        return;
    }

    WebDBReader reader = new WebDBReader(new File(argv[0]));
    try {
        if ("-pageurl".equals(argv[1])) {
            String url = argv[2];
            System.out.println(reader.getPage(url.trim()));
        } else if ("-pagemd5".equals(argv[1])) {
            MD5Hash md5 = new MD5Hash(argv[2]);
            Page pages[] = reader.getPages(md5);
            System.out.println("Found " + pages.length + " pages.");
            for (int i = 0; i < pages.length; i++) {
                System.out.println("Page " + i + ": " + pages[i]);
            }
        } else if ("-dumppageurl".equals(argv[1])) {
            System.out.println(reader);
            System.out.println();
            int i = 1;
            for (Enumeration e = reader.pages(); e.hasMoreElements(); i++) {
                Page page = (Page) e.nextElement();
                System.out.println("Page " + i + ": " + page);
                System.out.println();
            }
        } else if ("-dumppagemd5".equals(argv[1])) {
            System.out.println(reader);
            System.out.println();
            int i = 1;
            for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); i++) {
                Page page = (Page) e.nextElement();
                System.out.println("Page " + i + ": " + page);
                System.out.println();
            }
        } else if ("-toppages".equals(argv[1])) {
            int topSize = Integer.parseInt(argv[2]);

            // Create a sorted list
            SortedSet topSet = new TreeSet(new Comparator() {
                public int compare(Object o1, Object o2) {
                    Page p1 = (Page) o1;
                    Page p2 = (Page) o2;
                    if (p1.getScore() < p2.getScore()) {
                        return -1;
                    } else if (p1.getScore() == p2.getScore()) {
                        // If two scores are equal, we will
                        // use regular Page comparison (which
                        // uses URL as the primary key). We
                        // don't want to uniquify by score!
                        return p1.compareTo(p2);
                    } else {
                        return 1;
                    }
                }
            }
            );

            // Find the top "topSize" elts. With a non-positive
            // topSize nothing can ever be kept, so the scan is
            // skipped; otherwise lowestPage would still be null
            // when the replacement branch dereferences it.
            if (topSize > 0) {
                Page lowestPage = null;
                for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                    Page curPage = (Page) e.nextElement();
                    if (topSet.size() < topSize) {
                        topSet.add(curPage);
                        lowestPage = (Page) topSet.first();
                    } else if (lowestPage.getScore() < curPage.getScore()) {
                        topSet.remove(lowestPage);
                        topSet.add(curPage);
                        lowestPage = (Page) topSet.first();
                    }
                }
            }

            // Print them out
            int i = 0;
            for (Iterator it = topSet.iterator(); it.hasNext(); i++) {
                System.out.println("Page " + i + ": " + (Page) it.next());
                System.out.println();
            }
        } else if ("-linkurl".equals(argv[1])) {
            String url = argv[2];
            Link links[] = reader.getLinks(new UTF8(url.trim()));
            System.out.println("Found " + links.length + " links.");
            for (int i = 0; i < links.length; i++) {
                System.out.println("Link " + i + ": " + links[i]);
            }
        } else if ("-linkmd5".equals(argv[1])) {
            MD5Hash fromID = new MD5Hash(argv[2]);
            Link links[] = reader.getLinks(fromID);
            System.out.println("Found " + links.length + " links.");
            for (int i = 0; i < links.length; i++) {
                System.out.println("Link " + i + ": " + links[i]);
            }
        } else if ("-dumplinks".equals(argv[1])) {
            System.out.println(reader);
            System.out.println();
            // Walk the pages by MD5 and look up the links recorded
            // from each page's hash.
            Enumeration e = reader.pagesByMD5();
            while (e.hasMoreElements()) {
                Page page = (Page) e.nextElement();
                Link[] links = reader.getLinks(page.getMD5());
                if (links.length > 0) {
                    System.out.println("from " + page.getURL());
                    for (int i = 0; i < links.length; i++) {
                        System.out.println(" to " + links[i].getURL());
                    }
                    System.out.println();
                }
            }
        } else if ("-stats".equals(argv[1])) {
            System.out.println("Stats for " + reader);
            System.out.println("-------------------------------");
            System.out.println("Number of pages: " + reader.numPages());
            System.out.println("Number of links: " + reader.numLinks());
        } else {
            System.out.println("Sorry, no command with name " + argv[1]);
        }
    } finally {
        reader.close();
    }
}

/**
 * Print the command-line usage summary to standard output.
 */
private static void printUsage() {
    System.out.println("Usage: java net.nutch.db.WebDBReader <db> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
}

/**
 * Tell whether the given command reads a value from argv[2].
 */
private static boolean commandNeedsArgument(String command) {
    return "-pageurl".equals(command)
        || "-pagemd5".equals(command)
        || "-toppages".equals(command)
        || "-linkurl".equals(command)
        || "-linkmd5".equals(command);
}
}